from email import errors, header
from fractions import Fraction
from re import T, X
from tkinter import Variable
from tokenize import Ignore
from bs4 import BeautifulSoup
import re

import xlsxwriter
import requests 
# Pagination flag: reset to 0 and set back to 1 further down when a
# "Next page" link is found on the listing page.
loop = 1

# Module-level accumulators. Only `finallist` is filled by the visible code
# (one row per novel, appended by `extract`); `title` is only measured for
# the debug print near the end of the file, `amazon` is not referenced
# elsewhere in the visible file, and `noveldata` is shadowed by a local
# inside `extract`.
title, finallist, amazon, noveldata = [], [], [], []

def _between(text, start, end):
    """Return the part of *text* after the first *start* and before the next *end*.

    Raises IndexError when *start* does not occur in *text*. When *end* is
    missing, everything after *start* (up to the next *start*, if any) is
    returned.
    """
    return text.split(start)[1].split(end)[0]


def extract(tags, limit_start, limits_end):
    """Build one data row per listing tag and append it to ``finallist``.

    For every element of *tags* the serialized HTML is sliced with the
    marker pairs ``(limit_start[i], limits_end[i])``; each slice becomes one
    field of the row and the first field is treated as the novel's title.
    The title is then looked up in the listing page (parsed from the
    module-level ``result`` response) to find the outgoing store link. That
    page is downloaded and four more fields are appended in order:
    publisher, page count, ASIN and store title. The parsing markers suggest
    an Amazon.co.jp Kindle product page is expected.

    Any failure while locating, downloading or parsing the store page is
    printed and recorded as a single ``"error"`` entry appended after
    whatever store fields were already collected; processing then continues
    with the next tag.

    Args:
        tags: iterable of BeautifulSoup elements, one per novel.
        limit_start: start markers, one per field.
        limits_end: end markers, paired by position with *limit_start*
            (pairs beyond the shorter list are ignored).

    Returns:
        Always 0. The rows are delivered through the module-level
        ``finallist``.

    Raises:
        IndexError: if a start marker is missing from a tag's HTML, or if
            no marker pairs were supplied.
    """
    doc = BeautifulSoup(result.text, "html.parser")
    # Browser-like User-Agent sent with every store-page request; built once
    # because it never changes between iterations.
    headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

    for sludge in tags:
        card_html = str(sludge)
        row = [
            _between(card_html, start, end)
            for start, end in zip(limit_start, limits_end)
        ]
        print(row[0])

        # The title is literal text, not a regular expression: escape it so
        # characters such as "(", "?" or "+" neither break compilation nor
        # change what is matched.
        pattern = re.compile(re.escape(row[0]))
        try:
            # Walk from the title's text node to the anchor that carries the
            # store link; any missing hop raises and is handled below.
            title_node = doc.find(string=pattern)
            link = title_node.parent.next_sibling.next_element.next_sibling.next_element
            store_url = link.get('href')
            print(store_url)
            # The timeout keeps one stalled connection from hanging the run.
            response = requests.get(store_url, headers=headers, timeout=30)
            page_text = BeautifulSoup(response.text, "html.parser").get_text()
            sentence = ''.join(page_text.split()) #Credit to https://stackoverflow.com/questions/54065845/problem-in-scraping-data-in-non-english-character-sites-python for this
            row.append(_between(sentence, "Publisher", "Publicationdate"))
            row.append(_between(sentence, "Printlength", "pages"))
            # The colon after "ASIN" is wrapped in invisible RIGHT-TO-LEFT /
            # LEFT-TO-RIGHT marks; spelled as escapes so they stay visible.
            row.append(_between(sentence, "ASIN\u200f:\u200e", "Publisher"))
            row.append(_between(sentence, "Amazon.co.jp:", "eBook:"))
        except Exception as ex:
            # Deliberate best effort: one bad store page must not stop the
            # scrape, so log it and mark the row instead of re-raising.
            print(ex)
            row.append("error")
        finallist.append(row)
    return 0

# First page of the jpdb novel difficulty ranking.
url = "https://jpdb.io/novel-difficulty-list"

# NOTE: pagination is not wired up. The `while (loop == 1)` loop that would
# wrap the statements below is disabled, so only this first page is scraped;
# the next-page URL computed at the end is printed but never fetched.
result = requests.get(url)
doc = BeautifulSoup(result.text, "html.parser")

# Listing entries are selected by their exact inline style
# (presumably one such <div> per novel -- verify against the live page).
tags = doc.find_all(
    'div',
    attrs={
        "style": "margin-bottom: 3rem; margin-right: 1.5rem; display: flex; flex-direction: column; align-items: flex-start;"
    },
)

# Start/end markers, paired by position, that bracket each field inside an
# entry's serialized HTML. The first pair brackets the title.
limits_start = [
    '30rem;">',
    '(in words)</th><td>',
    'Unique words</th><td>',
    '(used once)</th><td>',
    '(used once %)</th><td>',
    'Unique kanji</th><td>',
    'kanji (used once)</th><td>',
    'Unique kanji readings</th><td>',
    'Difficulty</th><td>',
    'Average sentence length</th><td>',
    'Characters</th><td>',
]
limits_end = [
    '</h5>',
    '</td>',
    '</td>',
    '</td>',
    '</td>',
    '</td>',
    '</td>',
    '</td>',
    '/10</td>',
    '</td>',
    '</td>',
]
extract(tags, limits_start, limits_end)

# Look for a "Next page" link; when present, remember its absolute URL and
# raise the pagination flag.
loop = 0
tags2 = doc.find_all("a")
for tag in tags2:
    if tag.string != "Next page":
        continue
    url = "https://jpdb.io" + tag.get('href')
    print(url)
    loop = 1






#titleTemp = sentence.split("Amazon.co.jp:")
#title = titleTemp[1].split("eBook:")
#authorTemp = sentence.split("FollowtheAuthor") #These can cause crashes if the website in question doesn't have an Author listed ...
#author = authorTemp[1].split("Something")
#print(publisher[0])
# Define all of the temp containers 




# Global flow-control variables.
# `dummy` is bound to the `T` name imported from `re` at the top of the file.
# NOTE(review): `dummy` is not read anywhere in the visible file.
dummy = T

# Spreadsheet cursor for the xlsxwriter export, which is currently
# commented out below.
row = col = 0

# Debug output: size of the module-level `title` list, then every scraped row.
print(len(title))
print(finallist)
#workbook = xlsxwriter.Workbook('C:/Users/Jackson Budwell/Downloads/testscrape/data.xlsx')
#worksheet = workbook.add_worksheet()

#for tag in title:
  #  worksheet.write(row, col, tag)
   # row += 1
#row = 0
#col = 1
#for list in finallist:
#   for tag in list:
#        worksheet.write(row, col, tag)
#        col += 1
#    col = 1
#    row += 1

#workbook.close()